import numpy as np
import plotly.graph_objects as go
from scipy.stats import kurtosis, skew
from scipy.stats import anderson
import random
import statistics
from prettytable import PrettyTable
For a given sequence (File: E. coli K-12 MG 1655, GneBank id: U00096), find the frequency of occurrence of each of the nucleotides A, T, G, C’s, dinucleotides, and trinucleotides. For example for sequence ATGCCG dinucleotides would be AT, TG, GC etc and trinucleotides would be ATG, TGC, GCC etc.
# Reading Input
ecoli = []
with open("./Seq_Data/ecoli_11kb.txt", "r") as f:
f.readline()
ecoli = list(f.readline().strip().upper());
nucleotides = {'A':0, 'G':0, 'C':0, 'T':0}
dinucleotides = {}
trinucleotides = {}
for i in nucleotides:
for j in nucleotides:
dinucleotides[i+j] = 0
for k in nucleotides:
trinucleotides[i+j+k] = 0
# Computing frequency
for i in range(len(ecoli)):
nucleotides[ecoli[i]] = nucleotides[ecoli[i]] + 1
if i+1 < len(ecoli):
dinucleotides[ecoli[i]+ecoli[i+1]] = dinucleotides[ecoli[i]+ecoli[i+1]] + 1
if i+2 < len(ecoli):
trinucleotides[ecoli[i]+ecoli[i+1]+ecoli[i+2]] = trinucleotides[ecoli[i]+ecoli[i+1]+ecoli[i+2]] + 1
print("Nucleotide Frequency: ")
print(nucleotides)
print("Dinucleotide Frequency: ")
print(dinucleotides)
print("Trinucleotide Frequency: ")
print(trinucleotides)
Plot the density of nucleotides in a sequence (File: E. coli K-12 MG 1655). Graphically display n-mer (for n=2 and 3) in the given sequence. (You may use Excel sheet for obtaining the plots).
# Plotting Density functions
fig = go.Figure()
fig.add_trace(go.Bar(x=list(nucleotides.keys()),
y=[x/np.sum(list(nucleotides.values())) for x in list(nucleotides.values())],
marker_color='rgb(55, 83, 109)'
))
fig.update_layout(
title='Density of Nucleotides',
xaxis_title_text="Nucleotide",
yaxis_title_text="Density"
)
fig.show()
fig2 = go.Figure()
fig2.add_trace(go.Bar(x=list(dinucleotides.keys()),
y=[x/np.sum(list(dinucleotides.values())) for x in list(dinucleotides.values())],
marker_color='rgb(55, 83, 109)'
))
fig2.update_layout(
title='Density of Dinucleotides',
xaxis_title_text="Dinucleotide",
yaxis_title_text="Density"
)
fig2.show()
fig3 = go.Figure()
fig3.add_trace(go.Bar(x=list(trinucleotides.keys()),
y=[x/np.sum(list(trinucleotides.values())) for x in list(trinucleotides.values())],
marker_color='rgb(55, 83, 109)'
))
fig3.update_layout(
title='Density of Trinucleotides',
xaxis_title_text="Trinucleotide",
yaxis_title_text="Density"
)
fig3.show()
Consider the output generated by Program 1.
(a) What probability would you assign to an A being followed by a G? Under what assumptions does this seem appropriate?
(b) Given that there is an A at a given position along the E. coli genome, what probability would you assign to it being followed by a G, and why?
(c) Does the E. coli composition data suggest that the event we observe a G at one site is independent of the previous two bases? Explain fully with appropriate data.
(a) P(A being followed by G) = P(AG) = 0.056 (from the ba chart above). This is appropriate only if we assume the occurance of a dinucleotide is independent of occurances of other dinucleotide. Which technically isn't the case.
(b) P(G|A) = P(AG)/P(A) = 0.056/0.2483 = 0.2255
(c) To check if an event of occurance G is indepent of other events, it's enough to show that
P(G|E) = P(G) for any and every event.
P(G) = 0.2714
P(G|A) = 0.2255
P(G|C) = 0.31 which is a lot higher than P(G).
Therefore occurance of G is not independent of occurance or presence of of previous two bases.
Clearly the probability of occurance of G varies depending on if the preceeding base is an A or a C.
For the given sequence (File: E. coli K-12 MG 1655), count number of purines (A,G) in each block (block size of 100,1000, and 10,000bp).
(a) Calculate the mean and standard deviation of the proportion of purines per block, and draw histograms of these numbers.
(b) Compare the results of (a) across the different block sizes and comment.
(c) For each block size calculate fraction of counts within 1, 2 and 3 standard deviations of the mean.
def isPurine(n):
if ecoli[n] == 'A' or ecoli[n] == 'G':
return True
return False
count100 = np.zeros(len(ecoli)-100+1)
count1000 = np.zeros(len(ecoli)-1000+1)
count10000 = np.zeros(len(ecoli)-10000+1)
count = 0;
for i in range(100):
if(isPurine(i)):
count = count + 1
count100[0] = count
for i in range(100,len(ecoli)):
if(isPurine(i-100)):
count = count - 1
if(isPurine(i)):
count = count + 1
count100[i+1-100] = count
print("count100: ",count100)
count = 0;
for i in range(1000):
if(isPurine(i)):
count = count + 1
count1000[0] = count
for i in range(1000,len(ecoli)):
if(isPurine(i-1000)):
count = count - 1
if(isPurine(i)):
count = count + 1
count1000[i+1-1000] = count
print("count1000: ",count1000)
count = 0;
for i in range(10000):
if(isPurine(i)):
count = count + 1
count10000[0] = count
for i in range(10000,len(ecoli)):
if(isPurine(i-10000)):
count = count - 1
if(isPurine(i)):
count = count + 1
count10000[i+1-10000] = count
print("count10000: ",count10000)
# Mean proportion of a block is nothing but the proportion of the block itself
proportion100 = count100/100
proportion1000 = count1000/1000
proportion10000 = count10000/10000
# Standard deviation of each sample is nothing but (pq/N)^0.5
stdev100 = [(proportion100[i]*(1-proportion100[i])/100)**0.5 for i in range(len(count100))]
stdev1000 = [(proportion1000[i]*(1-proportion1000[i])/100)**0.5 for i in range(len(count1000))]
stdev10000 = [(proportion10000[i]*(1-proportion10000[i])/100)**0.5 for i in range(len(count10000))]
# Plotting distribution of proportion means
fig = go.Figure()
fig.add_trace(go.Histogram(
x=proportion100,
histnorm='probability',
marker_color='#091b52',
opacity=0.75
))
fig.update_layout(
title_text='Distribution of proportion means N = 100', # title of plot
xaxis_title_text='proportion', # xaxis label
yaxis_title_text='Probability', # yaxis label
)
fig.show()
fig1 = go.Figure()
fig1.add_trace(go.Histogram(
x=proportion1000,
histnorm='probability',
marker_color='#091b52',
opacity=0.75
))
fig1.update_layout(
title_text='Distribution of proportion means N = 1000', # title of plot
xaxis_title_text='proportion', # xaxis label
yaxis_title_text='Probability', # yaxis label
)
fig1.show()
fig2 = go.Figure()
fig2.add_trace(go.Histogram(
x=proportion10000,
histnorm='probability',
marker_color='#091b52',
opacity=0.75
))
fig2.update_layout(
title_text='Distribution of proportion means N = 10000', # title of plot
xaxis_title_text='proportion', # xaxis label
yaxis_title_text='Probability', # yaxis label
)
fig2.show()
When comparing the three graphs obtained, we can see that as the sample size increases the proportion means seem to converge to 0.5 i.e. half. Also we see that the range of proportion values decrease, in the last case it never happens that there exists a block with less than 0.5.
Theoretically this would mean that the probability of finding a piridine converges to 0.5.
# Plotting distribution of proportion means
fig = go.Figure()
fig.add_trace(go.Histogram(
x=stdev100,
histnorm='probability',
marker_color='#091b52',
opacity=0.75
))
fig.update_layout(
title_text='Distribution of block standard deviations N = 100', # title of plot
xaxis_title_text='standard deviation', # xaxis label
yaxis_title_text='Probability', # yaxis label
)
fig.show()
fig1 = go.Figure()
fig1.add_trace(go.Histogram(
x=stdev1000,
histnorm='probability',
marker_color='#091b52',
opacity=0.75
))
fig1.update_layout(
title_text='Distribution of block standard deviations N = 1000', # title of plot
xaxis_title_text='standard deviation', # xaxis label
yaxis_title_text='Probability', # yaxis label
)
fig1.show()
fig2 = go.Figure()
fig2.add_trace(go.Histogram(
x=stdev10000,
histnorm='probability',
marker_color='#091b52',
opacity=0.75
))
fig2.update_layout(
title_text='Distribution of block standard deviations N = 10000', # title of plot
xaxis_title_text='standard deviation', # xaxis label
yaxis_title_text='Probability', # yaxis label
)
fig2.show()
From the standard deviation graphs, we can see that the value seems to be converging and the range of possible standard deviations are decreasing but not as rampant as that of the mean.
# Calculating sampling distibution means and standard deviations
mean_100 = np.mean(proportion100)
mean_1000 = np.mean(proportion1000)
mean_10000 = np.mean(proportion10000)
stdev_100 = statistics.stdev(proportion100)
stdev_1000 = statistics.stdev(proportion1000)
stdev_10000 = statistics.stdev(proportion10000)
count_100_1 = 0
count_100_2 = 0
count_100_3 = 0
count_1000_1 = 0
count_1000_2 = 0
count_1000_3 = 0
count_10000_1 = 0
count_10000_2 = 0
count_10000_3 = 0
for i in proportion100:
if mean_100 - 3*stdev_100 < i < mean_100 + 3*stdev_100:
count_100_3 = count_100_3 + 1
if mean_100 - 2*stdev_100 < i < mean_100 + 2*stdev_100:
count_100_2 = count_100_2 + 1
if mean_100 - 1*stdev_100 < i < mean_100 + 1*stdev_100:
count_100_1 = count_100_1 + 1
for i in proportion1000:
if mean_1000 - 3*stdev_1000 < i < mean_1000 + 3*stdev_1000:
count_1000_3 = count_1000_3 + 1
if mean_1000 - 2*stdev_1000 < i < mean_1000 + 2*stdev_1000:
count_1000_2 = count_1000_2 + 1
if mean_1000 - 1*stdev_1000 < i < mean_1000 + 1*stdev_1000:
count_1000_1 = count_1000_1 + 1
for i in proportion10000:
if mean_10000 - 3*stdev_10000 < i < mean_10000 + 3*stdev_10000:
count_10000_3 = count_10000_3 + 1
if mean_10000 - 2*stdev_10000 < i < mean_10000 + 2*stdev_10000:
count_10000_2 = count_10000_2 + 1
if mean_10000 - 1*stdev_10000 < i < mean_10000 + 1*stdev_10000:
count_10000_1 = count_10000_1 + 1
print("Block Size: 100 ")
print("Fraction within 1 Stdev: ", count_100_1/len(count100))
print("Fraction within 2 Stdev: ", count_100_2/len(count100))
print("Fraction within 3 Stdev: ", count_100_3/len(count100))
print("Block Size: 1000 ")
print("Fraction within 1 Stdev: ", count_1000_1/len(count1000))
print("Fraction within 2 Stdev: ", count_1000_2/len(count1000))
print("Fraction within 3 Stdev: ", count_1000_3/len(count1000))
print("Block Size: 10000 ")
print("Fraction within 1 Stdev: ", count_10000_1/len(count10000))
print("Fraction within 1 Stdev: ", count_10000_2/len(count10000))
print("Fraction within 1 Stdev: ", count_10000_3/len(count10000))